package com.meiyuetao.myt.crawl.filter;

import java.math.BigDecimal;
import java.text.ParseException;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;

import lab.s2jh.core.pagination.GroupPropertyFilter;
import lab.s2jh.core.pagination.PropertyFilter;
import lab.s2jh.core.pagination.PropertyFilter.MatchType;
import lab.s2jh.crawl.filter.ParseFilterChain;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.data.domain.Sort.Direction;
import org.springframework.util.Assert;
import org.springframework.util.DigestUtils;

import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.meiyuetao.myt.crawl.entity.ParseCommodity;
import com.meiyuetao.myt.md.entity.Commodity;
import com.meiyuetao.myt.md.entity.CommodityComment;

/**
 * Parse filter for a single JD product detail page.
 *
 * <p>{@link #doFilterInternal(String, ParseFilterChain)} fetches the product page, creates or
 * refreshes the matching {@link ParseCommodity} record and, when a {@link Commodity} with the same
 * source URL already exists, also fetches up to {@value #MAX_REVIEW_PAGES} customer review pages
 * and stores reviews that are not stored yet.
 *
 * <p>{@link #parseSimpleData(String)} only extracts the current sale price.
 */
public class JdSingleParseFilter extends AbstractCommodityParseFilter {

    private static final Logger logger = LoggerFactory.getLogger(JdSingleParseFilter.class);

    /** Upper bound on the number of review pages fetched per product. */
    private static final int MAX_REVIEW_PAGES = 5;

    /** Number of most recently stored comments compared against to detect already-saved reviews. */
    private static final int RECENT_COMMENT_LOOKBACK = 5;

    /**
     * Domain fragments handed to {@code parseDescription}.
     * NOTE(review): presumably used to flag references back to JD inside the description - confirm
     * against the parent class.
     */
    private static final String[] ALERT_DOMAIN_INFOS = { "jd.com", "360buy.com", "3.cn" };

    /** Labels of the {@code DT} cells inside a review's content block. Runtime values; do not edit. */
    private static final String LABEL_ADVANTAGE = "优 点：";
    private static final String LABEL_DISADVANTAGE = "不 足：";
    private static final String LABEL_GAINED_KNOWLEDGE = "心 得：";
    private static final String LABEL_BOUGHT_DATE = "购买日期：";

    /**
     * Parses the product page at {@code url} and persists the result.
     *
     * @param url         product detail page URL; must match {@code urlMatchPattern} so that the
     *                    product code can be taken from capture group 1
     * @param filterChain the surrounding filter chain (not invoked by this implementation)
     * @throws IllegalArgumentException if no product code can be extracted from {@code url}
     */
    @Override
    public void doFilterInternal(String url, ParseFilterChain filterChain) {
        logger.debug("Invoking {} ...", this.getClass());
        HtmlPage htmlPage = fetchHtmlPage(url);

        String sn = null;
        Matcher matcher = this.urlMatchPattern.matcher(url);
        if (matcher.find()) {
            sn = matcher.group(1);
        }
        Assert.notNull(sn, "Unable to extract product code from url: " + url);

        // Breadcrumb links form the category path; the last link is deliberately left out.
        List<?> cateList = htmlPage.getByXPath("//DIV[@class='breadcrumb']//A");
        List<String> categories = Lists.newArrayList();
        for (int i = 0; i < cateList.size() - 1; i++) {
            HtmlElement node = (HtmlElement) cateList.get(i);
            categories.add(node.getTextContent().trim());
        }

        ParseCommodity parseCommodity = parseCommodityService.findByProperty("baseUrl", url);
        if (parseCommodity == null) {
            parseCommodity = new ParseCommodity(url);
            // Source group identifier; normally the main domain of the site.
            parseCommodity.setSourceType("jd.com");
            // Product code on the source site.
            parseCommodity.setSourceCode(sn);
            parseCommodity.setUid(url);
        }
        parseCommodity.reset();

        // Reviews are only collected for products that already exist as a Commodity.
        Commodity commodity = commodityService.findByProperty("sourceUrl", url);
        if (commodity != null) {
            crawlJdReviews(sn, commodity);
        }

        // Category path of the product.
        parseCommodity.setCategoryPath(StringUtils.join(categories, ">"));
        String title = parseTitle(htmlPage, "//DIV[@id='name']/H1");
        parseCommodity.setTitle(title);

        // Sale price.
        parseSalePrice(parseCommodity, htmlPage, "//DIV[@id='summary-price']//STRONG[@id='jd-price']", "//DIV[@id='summary-price']//STRONG[@id='jd-price']");
        // Product description.
        parseDescription(parseCommodity, htmlPage, "//DIV[@id='product-detail-1']//DIV[@class='detail-content']", "//DIV[@id='product-detail-1']//IMG", "data-lazyload",
                ALERT_DOMAIN_INFOS);
        // Showcase images.
        parseWindowImgs(parseCommodity, htmlPage, "//DIV[@id='spec-list']//DIV[@class='spec-items']//IMG", "//DIV[@id='spec-n1']//IMG", "data-lazyload");
        // Promotion slogan.
        parseSalePrompt(parseCommodity, htmlPage, "//DIV[@id='product-promotions']");
        // Stock information.
        parseSaleStock(parseCommodity, htmlPage, "//DIV[@id='store-prompt']//STRONG");

        logger.debug("Saving Parse Commodity: {}", parseCommodity);
        DateTime dTime = new DateTime();
        parseCommodity.setLastFetchTime(dTime.getMillis());
        parseCommodity.setLastFetchTimeLabel(dTime.toString("yyyy-MM-dd HH:mm:ss"));
        parseCommodityService.save(parseCommodity);
    }

    /**
     * Fetches the review pages of product {@code sn} and stores every review that is not stored yet.
     * Review page URLs look like {@code http://club.jd.com/review/623278-0-1-0.html}.
     *
     * <p>Crawling stops at the first page without review items, or as soon as a review is found
     * whose identification matches one of the most recently stored comments.
     */
    private void crawlJdReviews(String sn, Commodity commodity) {
        List<CommodityComment> recentComments = loadRecentJdComments(commodity);

        for (int pager = 1; pager <= MAX_REVIEW_PAGES; pager++) {
            String reviewUrl = "http://club.jd.com/review/" + sn + "-0-" + pager + "-0.html";
            logger.debug("Processing parse customer review page: {}", reviewUrl);
            // Fetched through HtmlUnit; elements below are read with HtmlUnit XPath queries.
            HtmlPage page = fetchHtmlPage(reviewUrl);

            List<?> nodeList = page.getByXPath("//DIV[@id='comments-list']//DIV[@class='mc']");
            // No review items on this page: nothing more to collect for this product.
            if (nodeList == null || nodeList.isEmpty()) {
                return;
            }

            for (Object item : nodeList) {
                if (!saveJdReviewItem((HtmlElement) item, commodity, recentComments)) {
                    // Reached a review that is already stored; everything after it is old.
                    return;
                }
            }
        }
    }

    /**
     * Loads the stored comments of {@code commodity} ordered by {@code sid} descending and returns
     * at most the first {@value #RECENT_COMMENT_LOOKBACK} of them. Never returns {@code null}.
     *
     * <p>The size is clamped because {@code subList(0, 5)} throws
     * {@link IndexOutOfBoundsException} when fewer than five comments exist, which is always the
     * case the first time a product's reviews are crawled.
     */
    private List<CommodityComment> loadRecentJdComments(Commodity commodity) {
        GroupPropertyFilter groupPropertyFilter = GroupPropertyFilter.buildDefaultAndGroupFilter();
        groupPropertyFilter.append(new PropertyFilter(MatchType.EQ, "commodity", commodity));
        List<CommodityComment> stored = commodityCommentService.findByFilters(groupPropertyFilter,
                new org.springframework.data.domain.Sort(Direction.DESC, "sid"));
        if (stored == null) {
            return Lists.newArrayList();
        }
        if (stored.size() > RECENT_COMMENT_LOOKBACK) {
            return stored.subList(0, RECENT_COMMENT_LOOKBACK);
        }
        return stored;
    }

    /**
     * Converts one review item element into a {@link CommodityComment} and saves it.
     *
     * @return {@code false} if the review matches one of {@code recentComments} (the caller should
     *         stop crawling); {@code true} otherwise, including when the review was skipped because
     *         it contains a sensitive word or lacks an expected element
     */
    private boolean saveJdReviewItem(HtmlElement node, Commodity commodity, List<CommodityComment> recentComments) {
        // The XPath queries here are evaluated from the document rather than from the node, so
        // every expression is prefixed with the id of the current item to stay inside it.
        String itemDivId = node.getAttribute("id");
        String itemXPath = "//DIV[@id='" + itemDivId + "']";

        String uname = getXPathValue(node, itemXPath + "//DIV[@class='u-name']/A");
        String dateComment = getXPathValue(node, itemXPath + "//SPAN[@class='date-comment']");

        // Uniqueness key of a review: MD5 of user name + comment date.
        // NOTE(review): getBytes() uses the platform charset; kept as-is because switching the
        // charset would change the key of reviews that are already stored.
        String identification = DigestUtils.md5DigestAsHex((uname + dateComment).getBytes());
        for (CommodityComment stored : recentComments) {
            if (identification.equals(stored.getIdentification())) {
                return false;
            }
        }

        CommodityComment commodityComment = new CommodityComment();
        // User name.
        commodityComment.setDisplayName(uname);
        // Title.
        commodityComment.setTitle(getXPathValue(node, itemXPath + "//DIV[@class='o-topic']//STRONG[@class='topic']//A"));
        fillJdReviewSections(node, itemXPath, commodityComment);

        // Reviews whose user name, title, advantage, disadvantage or experience text contains a
        // sensitive word are not recorded.
        if (isContainSensitiveWord(commodityComment.getTitle()) || isContainSensitiveWord(commodityComment.getDisplayName())
                || isContainSensitiveWord(commodityComment.getAdvantage()) || isContainSensitiveWord(commodityComment.getDisadvantage())
                || isContainSensitiveWord(commodityComment.getGainedKnowledge())) {
            return true;
        }

        commodityComment.setCommodity(commodity);
        // Comment time.
        try {
            commodityComment.setPublishTime(DateUtils.parseDate(dateComment, new String[] { "yyyy-MM-dd HH:mm" }));
        } catch (ParseException e) {
            logger.warn("Unparseable comment date: " + dateComment, e);
        }
        // Bought time. This always replaces any bought date read from the page in
        // fillJdReviewSections.
        commodityComment.setBoughtTime(createBoughtTime(commodityComment.getPublishTime()));
        commodityComment.setCommentFrom("jd.com");
        commodityComment.setIdentification(identification);

        commodityComment.setShowPicCount(0);
        commodityComment.setReplyCount(0);
        commodityComment.setUsefulCount(0);
        commodityComment.setUselessCount(0);

        HtmlElement starNode = node.getFirstByXPath(itemXPath + "//DIV[@class='o-topic']/SPAN[1]");
        HtmlElement commentNode = node.getFirstByXPath(itemXPath + "//DIV[@class='comment-content']");
        if (starNode == null || commentNode == null) {
            // A malformed item must not abort the parse of the whole product.
            logger.warn("Skipping review item {}: rating or content element not found", itemDivId);
            return true;
        }

        // The rating is whatever follows the first 7 characters of the SPAN's class attribute.
        String starClass = starNode.getAttribute("class");
        commodityComment.setEvalValue(new BigDecimal(starClass.substring(7)));

        // NOTE(review): toString() is kept from the original; confirm whether asXml() was intended.
        commodityComment.setCommentContent(commentNode.toString());
        commodityCommentService.save(commodityComment);
        return true;
    }

    /**
     * Reads the {@code DL} rows of a review's content block and copies the advantage,
     * disadvantage, experience and bought-date rows into {@code commodityComment}.
     */
    private void fillJdReviewSections(HtmlElement node, String itemXPath, CommodityComment commodityComment) {
        List<?> commentContentList = node.getByXPath(itemXPath + "//DIV[@class='comment-content']//DL");
        if (commentContentList == null) {
            return;
        }
        for (int j = 0; j < commentContentList.size(); j++) {
            HtmlElement contentNode = (HtmlElement) commentContentList.get(j);
            // Tag the row with a temporary id so the XPath below can address exactly this row.
            String rowXPath = "//DL[@id='dl" + j + "']";
            contentNode.setAttribute("id", "dl" + j);
            String name = getXPathValue(contentNode, rowXPath + "//DT");
            String value = getXPathValue(contentNode, rowXPath + "//DD");
            // Constant-first equals: a row without a DT must not raise a NullPointerException.
            if (LABEL_ADVANTAGE.equals(name)) {
                commodityComment.setAdvantage(value);
            } else if (LABEL_DISADVANTAGE.equals(name)) {
                commodityComment.setDisadvantage(value);
            } else if (LABEL_GAINED_KNOWLEDGE.equals(name)) {
                commodityComment.setGainedKnowledge(value);
            } else if (LABEL_BOUGHT_DATE.equals(name)) {
                try {
                    commodityComment.setBoughtTime(DateUtils.parseDate(value, new String[] { "yyyy-MM-dd" }));
                } catch (ParseException e) {
                    logger.warn("Unparseable bought date: " + value, e);
                }
            }
            contentNode.removeAttribute("id");
        }
    }

    /**
     * Fetches the page at {@code url} and extracts only the sale price.
     *
     * @param url product detail page URL
     * @return a map with the single key {@code "salePrice"}, or {@code null} when the URL is not
     *         accepted by this filter or no price text is found on the page
     */
    @Override
    public Map<String, Object> parseSimpleData(String url) {
        if (!isAcceptUrl(url)) {
            return null;
        }
        HtmlPage htmlPage = fetchHtmlPage(url);
        HtmlElement salePriceNode = htmlPage.getFirstByXPath("//DIV[@id='summary-price']//STRONG[@id='jd-price']");
        String salePrice = "";
        if (salePriceNode != null) {
            salePrice = salePriceNode.asText();
        }
        salePrice = cleanInvisibleChar(salePrice);
        if (StringUtils.isBlank(salePrice)) {
            return null;
        }

        // Trim before looking at the first character, so that the character tested and the
        // character removed are the same one.
        salePrice = salePrice.trim();
        char first = salePrice.charAt(0);
        if (first > '9' || first < '0') {
            // Leading non-digit (for example a currency sign): drop it.
            salePrice = salePrice.substring(1);
        }

        Map<String, Object> jsonMap = Maps.newLinkedHashMap();
        jsonMap.put("salePrice", salePrice);
        return jsonMap;
    }

}
